nusmods at https://api.nusmods.com/..JSON format, convert to a dataframe.myBid <- data.frame() # create empty dataframe which will act as a container to be populated with data
for(year in c(2011:2018)) # looping through each year
{
for(semester in c(1,2)) # loopinig through semesters
{
if(year == 2017 & semester == 2) # there is no cors biding data for 2017/2018 sem 2
{
} else if(year == 2018 & semester == 2) # there is no cors biding data for 2018/2019 sem 2
{
} else
{
# create the url where data is to be extracted from
myurl <- paste0("https://api.nusmods.com/", year, "-", year + 1, "/", semester, "/corsBiddingStatsRaw.json")
myjson <- fromJSON(file = url(myurl))
for(r in 1:length(myjson)) # for each element in the myjson list, append it to myModInfo
{
if(isTRUE(str_detect(myjson[[r]][["ModuleCode"]], "^PL"))) # only keep info if module code begins with PL
{
if(myjson[[r]][["Semester"]] == "1" | myjson[[r]][["Semester"]] == "2") # only get semester 1 and 2 information
{
myBid <- rbind(myBid, myjson[[r]]) # add to dataframe
}
}
myjson[[r]] <- NA # replace the element with NA to free up some rAM
}
}
cat(year, "Semester", semester, "Done!") # progress tracker
}
}
saveRDS(myBid, file = "myBid.RDS") # save to directorymyBid.RDSmyBid.RDS and load it directly from my local folder while I worked on the project.myModInfo <- data.frame() # create empty dataframe which will act as a container to be populated with data
for(year in c(2011:2018)) # looping through each year
{
for(semester in c(1,2))
{
# create the url where data is to be extracted from
myurl <- paste0("https://api.nusmods.com/", year, "-", year + 1, "/", semester, "/moduleTimetableDeltaRaw.json")
myjson <- fromJSON(file = url(myurl))
for(r in 1:length(myjson)) # for each element in the myjson list, append it to myModInfo
{
if(isTRUE(str_detect(myjson[[r]]$ModuleCode, "^PL"))) # only keep info if module code begins with PL
{
if(myjson[[r]]$Semester == 1 | myjson[[r]]$Semester == 2) # only get semester 1 and 2 information
{
myModInfo <- rbind(myModInfo, myjson[[r]]) # add to dataframe
}
}
myjson[[r]] <- NA # replace the element with NA to free up some rAM
}
cat(year, "Semester", semester, "Done!") # progress tracker
}
}
myTitles <- data.frame() # create empty dataframe which will act as a container to be populated with data
for(year in c(2014:2018)) # looping through each year
{
myurl <- paste0("https://api.nusmods.com/", year, "-", year + 1, "/moduleList.json") # create the url where data is to be extracted from
myjson <- fromJSON(file = url(myurl))
for(r in 1:length(myjson)) # for each element in the myjson list, append it to myModInfo
{
if(isTRUE(str_detect(myjson[[r]]$ModuleCode, "^PL"))) # only keep info if module code begins with PL
{
if(paste0(myjson[[r]]$Semester, collapse = "|") == "1"|
paste0(myjson[[r]]$Semester, collapse = "|") == "2"|
paste0(myjson[[r]]$Semester, collapse = "|") == "1|2") # only keep information from semester 1 and 2
{
myTitles <- rbind(myTitles, as.data.frame(myjson[[r]])) # add to dataframe
}
}
myjson[[r]] <- NA # free RAM
}
}
myModInfo <- myTitles %>% # add titles information to myModInfo
select(ModuleCode, ModuleTitle) %>% # select these two columns
filter(ModuleTitle != "Lab in Applied Psychology") %>%
distinct() %>% # remove duplicates
right_join(myModInfo, by = "ModuleCode") # left = myTitles, right = myModInfo
saveRDS(myModInfo, file = "myModInfo.RDS") # save to directorymyModInfo.RDSmyModInfo.RDS and load the data directly while I worked on the project.myModInfo.
myModInfo <- myModInfo %>%
select(-LastModified, -LastModified_js, -isDelete) %>% # remove these columns
filter(str_detect(ModuleCode, "^PL")) %>% # removing non-Psychology modules
filter(!is.na(ModuleTitle)) %>% # removing modules without module titles #PL3285, PL4220, PL4217
filter(LessonType != "TUTORIAL") %>% # removing information about tutorials
select(AcadYear, Semester, ModuleCode, ModuleTitle, DayText, StartTime, Semester, ClassNo) %>%# select these columns
distinct(AcadYear, Semester, ModuleCode, ClassNo, DayText, StartTime, .keep_all = TRUE) # remove duplicates
modrow <- nrow(myModInfo) # get number of rows of myMoInfo
myModInfo <- myModInfo %>%
mutate(rowindex = 1:modrow) %>% # create new row that is the row number
arrange(-rowindex) %>% # invert the dataframe, make it upside down, reason: latest entry are appended to the bottom of the dataframe!
distinct(AcadYear, Semester, ModuleCode, ClassNo, .keep_all = TRUE) %>% # remove duplicates based on these columns
select(-rowindex) # remove rowindex
tail(myModInfo) # peekmyBid.
myModInfo.myBid <- myBid %>%
filter(str_detect(ModuleCode, "^PL")) %>% # removing non-Psychology modules
filter(!str_detect(ModuleCode, "PLS|PLB")) %>% # remove PLS and PLB modules
filter(!str_detect(StudentAcctType, "Reserved")) %>% # remove reserved rounds
filter(!str_detect(StudentAcctType, "[G]")) %>% # remove bidding information from non-psychology students
select(-Faculty) %>% # remove this columns
mutate(Group1 = gsub("-", "", Group)) %>% # remove hyphens such that it works with parse_number()
mutate(ClassNo = as.character(parse_number(Group1))) # new column signifying which lecture slots for modules with >1 lecture slots
head(myBid) # peekmyModInfo and myBid.# modules that do not appear in both dataframes are dropped
mydata <- inner_join(myBid,
myModInfo,
by = c("ModuleCode", "AcadYear", "Semester", "ClassNo"))
head(mydata) # peek# transform these columns to numeric
for(r in c("Quota", "Bidders", "LowestBid", "LowestSuccessfulBid", "HighestBid", "StartTime"))
{
mydata[,grep(r, names(mydata))] <- as.numeric(mydata[,grep(r, names(mydata))])
}
# transform these columns to factors
for(r in c("AcadYear", "Semester", "ModuleCode", "Round", "StudentAcctType", "DayText", "StudentAcctType", "ModuleTitle", "Group$", "ClassNo"))
{
mydata[,grep(r, names(mydata))] <- factor(mydata[,grep(r, names(mydata))])
}DayText LevelsStudentAcctType LevelsGroup Levels# create new variable that indicates the level of the module, based on their module code
mydata$Level <- factor(ifelse(str_detect(mydata$ModuleCode, "1[0-9][0-9][0-9]"), "Level 1",
ifelse(str_detect(mydata$ModuleCode, "2[0-9][0-9][0-9]"), "Level 2",
ifelse(str_detect(mydata$ModuleCode, "3[0-9][0-9][0-9]"), "Level 3",
ifelse(str_detect(mydata$ModuleCode, "4[0-9][0-9][0-9]"), "Level 4",
"Graduate Module")))))mydata$Category <- ifelse(str_detect(mydata$ModuleCode, "^PL328"), "Lab",
ifelse(str_detect(mydata$ModuleCode, "^PL4[0-9][0-9][0-9]"), "Honor",
ifelse(str_detect(mydata$ModuleCode, "PL323[2-6]|PL1101|PL213[1-2]"), "Core", "Elective")))
mydata$Category <- factor(mydata$Category,
levels = c("Core", "Elective", "Lab", "Honor"))# create vector of the column names which are factors
facnames <- mydata %>% select_if(is.factor) %>% names()
# facnames without ModuleCode and StudentAcctType
facnames.mod <- facnames[-grep("ModuleCode|ModuleTitle", facnames)]
# create vector ofthe column names which are numeric
numnames <- mydata %>% select_if(is.numeric) %>% names()
# numnames without StartTime
numnames.time <- names(select_if(mydata, is.numeric))[-grep("StartTime", numnames)]Bidders is calculated across all academic years, all bidding rounds, all modules…## 'data.frame': 1934 obs. of 20 variables:
## $ AcadYear : Factor w/ 8 levels "2011/2012","2012/2013",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Semester : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ Round : Factor w/ 7 levels "1A","1B","1C",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ModuleCode : Factor w/ 87 levels "PL1101E","PL2131",..: 1 1 2 2 3 3 4 4 5 5 ...
## $ Group : Factor w/ 4 levels "LEC1","LEC2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Quota : num 95 430 5 12 35 35 28 50 25 22 ...
## $ Bidders : num 10 100 3 42 8 3 7 2 8 5 ...
## $ LowestBid : num 1 1 1 205 1 1 1 1 1 1 ...
## $ LowestSuccessfulBid: num 1 1 1 977 1 1 1 1 1 1 ...
## $ HighestBid : num 500 1150 368 1255 500 ...
## $ StudentAcctType : Factor w/ 4 levels "New[P]","NUS[P]",..: 3 1 3 1 3 1 3 1 3 1 ...
## $ Group1 : chr "LECTURE 1" "LECTURE 1" "LECTURE 1" "LECTURE 1" ...
## $ ClassNo : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ ModuleTitle : Factor w/ 85 levels "Abnormal Psychology",..: 34 34 74 74 75 75 8 8 13 13 ...
## $ DayText : Factor w/ 5 levels "Monday","Tuesday",..: 1 1 3 3 2 2 2 2 3 3 ...
## $ StartTime : num 1800 1800 1600 1600 800 800 1200 1200 1400 1400 ...
## $ Level : Factor w/ 4 levels "Level 1","Level 2",..: 1 1 2 2 2 2 3 3 3 3 ...
## $ BidPerQuota : num 0.105 0.233 0.6 3.5 0.229 ...
## $ Period : Factor w/ 2 levels "Morning",">=Afternoon": 2 2 2 2 1 1 2 2 2 2 ...
## $ Category : Factor w/ 4 levels "Core","Elective",..: 1 1 1 1 1 1 1 1 1 1 ...
## Warning in describe(mydata): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
# plot the categorical variables
for(r in facnames.mod)
{
plot(
ggplot(data = mydata, aes_string(x = r, fill = r)) +
geom_histogram(stat = "count") +
ylab("Count") +
ggtitle(paste0("Count of ", r)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90),
axis.title.x = element_blank(),
legend.position = "none")
)
}# plot the continuous variables
for(r in numnames)
{
plot(
ggplot(data = mydata, aes_string(x = r, fill = r)) +
geom_histogram(fill = "violetred", alpha = 0.5, bins = 50) +
ylab("Histogram") +
ggtitle(paste0(r)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90),
axis.title.x = element_text())
)
}for(r in 1:length(facnames.mod)) # loop across all factors
{
for(i in 1:length(facnames.mod)) # inner loop
{
if(i == r | i < r)
{ # dont do anything if they are the same or the graph has been made before
} else {
tempform <- paste0("~ ", facnames.mod[r], " + ", facnames.mod[i]) # create formula for xtabs
# temp is a dataframe that is only going to exist in this section and overwritten with each loop
temp <- as.data.frame(xtabs(eval(parse(text = tempform)),
data = mydata,
subset = NULL))
plot(
ggplot(data = temp, aes_string(x = facnames.mod[r], y = facnames.mod[i], fill = "Freq", label = "Freq")) +
geom_tile() +
geom_text() +
scale_fill_gradient(low = "white", high = "violetred") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90),
legend.position = "none")
)
}
}
}for(r in 1:length(numnames)) # loop across all numeric columns
{
for(i in 1:length(numnames)) # inner loop
{
if(i == r | i < r)
{ # dont do anything if they are the same or the graph has been made before
} else {
# create formulas for lm()
tempform.std <- paste0("scale(", numnames[i],")", " ~ ", "scale(", numnames[r], ")") # standardized
tempform <- paste0(numnames[i], " ~ ", numnames[r]) # unstandardized
# regress to get best fit line
stdreg <- lm(eval(parse(text = tempform.std)),
data = mydata) # standardized
reg <- lm(eval(parse(text = tempform)),
data = mydata) # unstandardized
plot(
ggplot(data = mydata, aes_string(x = numnames[r], y = numnames[i])) +
geom_point(color = "violetred", size = 2, alpha = 0.3) +
theme_classic() +
geom_abline(slope = reg$coefficients[2], intercept = reg$coefficients[1], lty = "dashed") +
geom_label(aes(x = Inf, y = Inf, label = paste0("Standardized Regression Coefficient = ",
round(stdreg$coefficients[2],3)),
hjust = 1, vjust = 2)) +
theme(axis.text.x = element_text(angle = 90))
)
}
}
}for(r in facnames.mod) # loop across all factor columns
{
for(i in numnames) # inner loop across all numeric columns
{
plot(
ggplot(data = mydata, aes_string(x = r, y = i, fill = r)) +
geom_boxplot() +
theme_classic() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 90))
)
}
}for(r in c("Core", "Elective", "Lab", "Honor"))
{
plot(mydata %>%
filter(Category == r) %>%
filter(Round == "1A") %>%
mutate(ModuleCode = fct_reorder(ModuleCode, Quota)) %>%
ggplot(mapping = aes(Quota)) +
stat_ecdf(geom = "step", size = 0.5) +
ylab("Cumulative Distribution") +
ggtitle(r) +
theme_classic() +
theme(legend.position = "none",
title = element_text(size = 7)))
}for(r in sort(as.character(unique(mydata$ModuleCode))))
{
plot(
mydata %>%
filter(ModuleCode == r) %>%
filter(Round == "1A") %>%
mutate(AcadSem = paste0(AcadYear,"/S" ,Semester)) %>%
group_by(ModuleCode, AcadSem) %>%
summarize(meanQuota = mean(Quota)) %>%
ggplot(mapping = aes(x = AcadSem, y = meanQuota, group = ModuleCode)) +
geom_point() +
geom_path() +
ggtitle(r) +
ylim(c(0,200)) +
theme_classic() +
theme(axis.text.x = element_blank(),
legend.position = "top",
strip.background = element_rect(fill = "grey30", linetype = "blank"),
strip.text = element_text(color = "white", size = 12))
)
}for(r in sort(as.character(unique(mydata$ModuleCode))))
{
plot(
mydata %>%
filter(ModuleCode == r) %>%
filter(Round == "1A") %>%
mutate(AcadSem = paste0(AcadYear,"/S" ,Semester)) %>%
group_by(ModuleCode, AcadSem) %>%
summarize(meanLSB = mean(LowestSuccessfulBid)) %>%
ggplot(mapping = aes(x = AcadSem, y = meanLSB, group = ModuleCode)) +
geom_point() +
geom_path() +
ggtitle(r) +
ylim(c(0,2200)) +
theme_classic() +
theme(axis.text.x = element_blank(),
legend.position = "top",
strip.background = element_rect(fill = "grey30", linetype = "blank"),
strip.text = element_text(color = "white", size = 12))
)
}Lets look at each module and compare the average number of bidders, bidders per quota and lowest successful bids when the lecture begins in and after the morning.
for(r in c("meanBidders", "meanBpQ", "meanLSB"))
{
plot(mydata %>%
group_by(ModuleCode, ModuleTitle, Period) %>%
summarise(meanBidders = mean(Bidders), meanBpQ = mean(BidPerQuota), meanLSB = mean(LowestSuccessfulBid),
sdBidders = sd(Bidders), sdBpQ = sd(BidPerQuota), sdLSB = mean(LowestSuccessfulBid)) %>%
ggplot(aes_string(x = "Period", y = r, fill = "Period")) +
geom_bar(stat = "identity") +
theme_classic() +
theme(axis.text.x = element_blank(),
legend.position = "top",
strip.background = element_rect(fill = "grey30", linetype = "blank"),
strip.text = element_text(color = "white", size = 12)) +
facet_grid(~ ModuleCode:ModuleTitle, labeller = label_wrap_gen(width = 25)) +
ggtitle(r))
}for(r in c("meanBidders", "meanBpQ", "meanLSB"))
{
plot(mydata %>%
group_by(Level, Period) %>%
summarise(meanBidders = mean(Bidders), meanBpQ = mean(BidPerQuota), meanLSB = mean(LowestSuccessfulBid),
sdBidders = sd(Bidders), sdBpQ = sd(BidPerQuota), sdLSB = mean(LowestSuccessfulBid)) %>%
ggplot(aes_string(x = "Period", y = r, fill = "Period")) +
geom_bar(stat = "identity") +
theme_classic() +
theme(axis.text.x = element_blank(),
legend.position = "top",
strip.background = element_rect(fill = "grey30", color = "black"),
strip.text = element_text(color = "white", size = 12)) +
facet_wrap(~ Level) +
ggtitle(r))
}StartTimeHrs12 And Group-Mean Centered Hrs12.
Inadvertently, we began to evaluate the demand or popularity of each module to guide our bidding choices. We might even advise juniors or peers based on such evaluations. For example:
Psychological Therapies is the most popular module, so you need to plan ahead and stockpile points from previous semesters if you plan on bidding for them.
The above originated from observing peers grief over their inability to secure a place in Psychological Therapies due to the exhorbitant amount of points required (which required students to stockpile points from previous semesters). But I have never heard anyone claiming that they really wanted to study Cognitive Neuroscience but failed to bid for it.
But was it true that Psychological Therapies was the most popular module? Rather than inferring popularity from personal anecdotes and observation, do we have data to support this claim? The answer is yes! Past bidding statistics and other module information are available at https://nusmods.com/api/. All thanks to the team at NUSMods who created a great timetabling tool for all NUS students. With these data, we can pitch the question broader and ask,
What were the most popular modules?
The information was downloaded, extracted, transformed, analysed and visualized using R. The codes are available under Codes tab above. The API contains extracted data for all modules from different majors and faculty but I will focus only on Psychology modules in this post as I have greater familiarity with them.
For the typical Psychology major, there are broadly four categories of modules.
| Categories | Description |
|---|---|
| Core Modules | Modules that are required for all undergraduates. Includes PL1101E, PL2131, PL2132, PL3232 to PL3236. |
| Level 3 Elective Modules | Modules that are outside of the core modules. Between four to six of these are required by all undergraduates to graduate. Their module codes run from PL3237 to PL3260. |
| Level 3 Lab Modules | Lab modules are structured as individual or group research projects in a specific domain of Psychology. Every undergraduate is required to complete at least one of these modules. Their module codes are prefixed with PL328x. |
| Level 4 Honor Modules | Modules that are required to graduate on the Honors track, usually taken near the end of the undergraduate degree. Between five to eight of these are required to graduate. They are prefixed with PL4xxx. |
Core modules were usually simple to get and would most likely be allocated to you in the Module Preference Exercise (more on that later…). Within the other three categories, what were the most popular modules?
To proceed, we would need some consensus on what popularity is and how to it. Luckily, the data contained bidding statistics that could be indicators of popularity. These are the key bidding statistics/variables:
We define a popular modulea as possessing the following characteristic in Round 1A (the first round of bidding):
Modules that do not fit criteria 1. and 2. will not be considered popular. Amongst these modules, 3. and 4. will be used to determine which modules were most popular.
The elective modules were not filtered by median Quota or BpQ because the Quota for elective varied greatly across different modules unlike the Lab and Honors modules. Here